This notebook accompanies below paper: XXX
In this notebook, we apply linear regression on FreeSurfer outputs of the PNC study using a robust linear regression technique.
Strucutral MRI of the PNC study were used for this study.
Morphological features of the strucutral images were derived using FreeSurfer toolkit.
FreeSurfer analysis was performed using LONI pipeline on high performance computing of **USC** Mark and Mary Stevens Neuroimaging and Informatics Institute, Keck school of Medicine of **USC**. The outputs were QC'ed and used as the input of this notebook.
author:
Farshid Sepehrband,
Laboratory of Neuro Imaging, USC Mark and Mary Stevens Neuroimaging and Informatics Institute, Keck School of Medicine of USC, University of Southern California, Los Angeles, CA, USA
farshid.sepehrband@loni.usc.edu
@fsepehrband
import numpy as np
import matplotlib.pyplot as plt
import plotly.plotly as py
import pandas as pd
import os
%pylab inline
myDir = '/Volumes/candl/farshid/BDDS/PNC/BDDS_AnatSexDiff'
os.chdir(myDir)
dataset = pd.read_csv('data/pnc_withNames.csv')
## dataset has extra quotation mark in the names, so:
dataset.columns = dataset.columns.str.replace('\'',"")
# remove rows with NaN values
tempo = dataset.isnull().sum(axis=1)
indx_nan = list(np.where(tempo >= 1))
dataset.drop(dataset.index[indx_nan],inplace=True)
# check input dataframe
dataset.head()
As an example we plot estimated total intracranial volume (eTIV) as a function of age across gender. Any given neuroanatomical feature can be plotted here.
neuro = 'EstimatedTotalIntraCranialVol'
# neuro = 'FS_aparc_2009:ctx_lh_G_occipital_middle:L:thickness:'
# sns.set_style("whitegrid")
temp = [col for col in list(dataset) if neuro in col]
for i in 0,1:
x = dataset.loc[dataset['Gender']==i,'Age']/12
y = dataset.loc[dataset['Gender']==i,temp]
plt.scatter(x,y,color='red' if i==1 else 'blue',label='female' if i==1 else 'male')
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.legend(loc='lower left',fontsize=12)
plt.xlabel('Age (yrs)',size=15)
plt.ylabel(r'eTIV ( mm$^3$)',size=15)
plt.show()
This notebook enables accessible reproduction of literature findings. For example,
Ritchie et al reported that the cortical surface are of the right precentral gyrus is significantly different between male and females, even with brain size adjustments.
"... for some regions, especially for surface area (particularly in areas such as the left isthmus of the cingulate gyrus and the right precentral gyrus), males still showed a significantly higher measurement, indicating specific sex differences in the proportional configuration of the cortex, holding brain size equal."
We therefore, selected right precentral gyrus as the imaging derived phenotypes (IDP) of interest for the second example, and looked at the sex difference:
temp
atlas_temp = 'FS_aparc_aseg'
region_temp = 'rh_precentral'
pheno_temp = 'area'
temp = [col for col in list(dataset) if (region_temp in col) & (pheno_temp in col) & (atlas_temp in col)]
# temp = [col for col in list(dataset) if neuro in col]
neuro2 = 'EstimatedTotalIntraCranialVol'
temp2 = [col for col in list(dataset) if neuro2 in col]
compMean = pd.DataFrame(np.empty((2,2)),index=['male','female'],columns=['without eTIV','with eTIV'])
fig = plt.figure(figsize=(12,4))
for i in 0,1:
x = dataset.loc[dataset['Gender']==i,'Age']
y = dataset.loc[dataset['Gender']==i,temp]
ax1 = fig.add_subplot(131)
y.plot(kind='density',ax=ax1,legend=False,label='female' if i==1 else 'male')
if i==0:
compMean.loc['male','without eTIV'] = float(np.mean(y))
else:
compMean.loc['female','without eTIV'] = float(np.mean(y))
# plt.hist(np.nan_to_num(y),color='red' if i==1 else 'blue',label='female' if i==1 else 'male')
plt.legend(('male','female'),loc='upper left')
plt.xticks(fontsize=14,rotation=30)
plt.xlabel(pheno_temp.title()+r'( mm$^2$)',size=14)
plt.ylabel('density functions',size=16)
plt.yticks(fontsize=14)
plt.title(str(temp).split(':')[2],size=14)
eTIV = dataset.loc[dataset['Gender']==i,temp2]
#
y = np.divide(y,eTIV)
ax2 = fig.add_subplot(132)
y.plot(kind='density',ax=ax2,legend=False,label='female' if i==1 else 'male')
plt.legend(('male','female'),loc='upper left')
plt.xlabel(pheno_temp.title()+r'/eTIV ( mm$^{-1}$)',size=14)
plt.xticks(fontsize=14,rotation=30)
plt.yticks(fontsize=14)
plt.ylabel('')
plt.title(str(temp).split(':')[2],size=14)
if i==0:
compMean.loc['male','with eTIV'] = float(np.mean(y))
else:
compMean.loc['female','with eTIV'] = float(np.mean(y))
y = np.divide(dataset.loc[dataset['Gender']==i,temp]**(1/2),eTIV**(1/3))
ax2 = fig.add_subplot(133)
y.plot(kind='density',ax=ax2,legend=False,label='female' if i==1 else 'male')
plt.legend(('male','female'),loc='upper left')
plt.xlabel(pheno_temp.title()+r'$^{1/2}$ / eTIV$^{1/3}$ (unitless)',size=14)
plt.xticks(fontsize=14,rotation=30)
plt.yticks(fontsize=14)
plt.ylabel('')
plt.title(str(temp).split(':')[2],size=14)
if i==0:
compMean.loc['male','with eTIV'] = float(np.mean(y))
else:
compMean.loc['female','with eTIV'] = float(np.mean(y))
plt.show()
compMean
Now we start prepare the data for the regression analysis.
For the ease of use, neuroimaging data are separated from demographic information. The estimated total intracranial volume was also separated to be added as a covariate to the regression analysis.
eTIV_name = [col for col in list(dataset) if 'EstimatedTotalIntraCranialVol:N:volume' in col][0]
temp = [col for col in list(dataset) if col.startswith('FreeSurfer')]
temp.remove(eTIV_name)
nimg = dataset[temp]
temp = [col for col in list(dataset) if 'FreeSurfer' not in col]
temp.remove('Gender')
conf = dataset[temp]
conf = pd.concat([conf, dataset[eTIV_name]],axis=1)
conf.rename(columns = {eTIV_name:'eTIV'}, inplace = True)
pheno = dataset['Gender']
pd.concat([conf,pheno],axis=1).describe()
Here we draw a pairplot of demographic information and eTIV.
import seaborn as sns
fig1 = plt.figure()
test = pd.concat([conf,pheno],axis=1)
ax = sns.PairGrid(test)#, vars=['Age', 'Gender', 'ICV'])
ax.map_diag(plt.hist)
ax.map_offdiag(sns.kdeplot, cmap="Blues_d", n_levels=6);
ax.fig.autofmt_xdate()
plt.show()
For linear regression, following model was used:
$\beta_0 + \beta_1 \times AGE + \beta_2 \times GENDER + \beta_3 \times eTIV + ... = IDP$
IDP: Imaging derived phenotype
It was implemented by minimizing Huber's loss function, using least trimmed squares estimator. See method section of the paper for more info.
PS: Both categorical columns (gender and race) are binary, so no need for dummy coding.
warnings.filterwarnings("ignore",category =RuntimeWarning)
import statsmodels.api as sm
# create an output data frame to store beta value, t-stat, p-value and -log(p-value)
output = pd.DataFrame(np.empty((nimg.shape[1],9)),index=list(nimg))
output.columns = ['software', 'atlas', 'region', 'hemisphere', 'neuro-param',\
'beta','t-stat','p-value','NLogP']
X = pd.concat([conf,pheno], axis=1)
X['intercept'] = np.ones((X.shape[0],1))
for ni in range(nimg.shape[1]):
out_names = output.index[ni].split(':')
output.loc[list(nimg)[ni],'software':'neuro-param'] = out_names[0:5]
y = nimg.iloc[:,ni]
trimmed = sm.RLM(y, X, M=sm.robust.norms.TrimmedMean())
# Four columns that contian only zero values, resulted "singularity error" in robust regression fitting (in measuring covariance).
# These columns are ignored.
try:
results = trimmed.fit(scale_est=sm.robust.scale.HuberScale(), cov="H2")
except:
results = trimmed.fit(scale_est=sm.robust.scale.HuberScale(), cov="H1")
print('Ignoring '+output.index[ni])
# save regression results
output.loc[list(nimg)[ni],'beta'] = results.params['Gender']
output.loc[list(nimg)[ni],'t-stat'] = results.tvalues['Gender']
output.loc[list(nimg)[ni],'p-value'] = results.pvalues['Gender']
output.loc[list(nimg)[ni],'NLogP'] = -np.log10(results.pvalues['Gender'])
output.index=range(nimg.shape[1])
output.beta = np.round(output.beta,decimals=2)
output.NLogP = np.round(output.NLogP,decimals=2)
output['t-stat'] = np.round(output['t-stat'],decimals=2)
output.head()
# define p-value threshold and determine Bonferoni threshold
Pval = 0.01
Pval0 = 0.0001
Bonf = -np.log10(Pval/len(output.index))
Bonf0 = -np.log10(Pval0/len(output.index))
# scatter plot results and draw Bonferoni line
fig = plt.figure(figsize=[20,5],dpi=600)
ax = fig.add_subplot(1, 1, 1,xlim=[-100,2200],facecolor='white')
ax.grid(color='lightgray')
def color_coding(features):
face_color = ['darkred']*len(features)
for i in range(len(features)):
if features[i] == 'area':
face_color[i] = 'purple'
elif features[i] == 'curvind':
face_color[i] = 'dodgerblue'
elif features[i] == 'thickness':
face_color[i] = 'darkblue'
elif features[i] == 'volume':
face_color[i] = 'darkgreen'
elif features[i] == 'gauscurv':
face_color[i] = 'orange'
elif features[i] == 'meancurv':
face_color[i] = 'orangered'
elif features[i] == 'thicknessstd':
face_color[i] = 'lawngreen'
return face_color;
face_color = color_coding(output['neuro-param'])
ax.scatter(range(len(output.index)),output['NLogP'],label='-log(p)',c = face_color,s=50,edgecolor=face_color)
ax.plot([0,2500],[Bonf, Bonf],c='orange',label='Bonferoni')
ax.plot([0,2500],[Bonf0, Bonf0],c='red',label='Bonferoni')
plt.xlabel('Neuroanatomical features',size=20)
plt.ylabel('negative log(p-value)',size=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlim((-10,len(output)+10))
# plt.ylim((-0.5,26))
plt.title('Univariate robust linear regression',fontsize=24)
# plt.savefig('output/ManPlot_UniLR_RLM'+'_'+datetime.datetime.now().strftime('%Y-%m-%d'))
plt.show()
Here we performed a logistic regression on 4 most significant coritical surface area statistics of above figure ($-log(p)>15$):
**purple** pints on the left side of the above plot.
The query code for extracting these regions is:
output.loc[(output['neuro-param']=='area') & (output['atlas']=='FS_aparc_2009') & (output['NLogP']>15)]
which returns:
Therefore, we wrote:
$logit(gender)=\beta _{0}+\beta _{1}eTIV+\beta _{2}RACE+\beta _{3}AGE+\sum_{i=1}^{4}\beta _{i+3}IDP_{i}$
temp = output.loc[(output['neuro-param']=='area') & (output['atlas']=='FS_aparc_2009') & (output['NLogP']>15)].index
X_temp = nimg.iloc[:,temp]
name = list()
for i in range(len(list(X_temp))):
name.append('-'.join(list(X_temp)[i].split(':')[2:4]))
X_temp.columns = name
X_multivar = pd.concat([X_temp,conf],axis=1)
X_multivar['intercept']=1
X_multivar
multivariate_test = sm.Logit(pheno, X_multivar).fit(disp=0)
print(multivariate_test.summary())
Note two of the p-values are not significant ($P(\beta_1) = 0.1$ and $P(\beta_3) = 0.14$).
To show the effect of outliers on derived measures, here we compare ordinary least square linear regression with robust linear regression. For better visualization, only age is used as covariates and results are plotted on age.
We also compare few difference approaches of robust fitting.
from statsmodels.sandbox.regression.predstd import wls_prediction_std
# create X and y
X = pd.concat([conf,pheno], axis=1)
X['intercept'] = np.ones((X.shape[0],1))
neuro_interest = 'FreeSurfer:FS_aparc_2009:ctx_rh_G_pariet_inf-Supramar:R:gauscurv:'
y = nimg[neuro_interest]
X.drop(['Race','eTIV','Gender'],axis=1,inplace=True)
# performing different fittings
huber_t = sm.RLM(y, X, M=sm.robust.norms.HuberT())
OLS = sm.OLS(y,X).fit()
hub_results = huber_t.fit(cov="H2")
andrew = sm.RLM(y, X, M=sm.robust.norms.AndrewWave())
andrew_results = andrew.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
trimme = sm.RLM(y, X, M=sm.robust.norms.TrimmedMean())
trimme_results = trimme.fit(scale_est=sm.robust.scale.HuberScale(), cov="H3")
# gather and print results
compare = pd.concat([OLS.params,hub_results.params,andrew_results.params,trimme_results.params],axis=1)
compare.columns = ['OLS','HuberT','Wave','Trimmed']
print(compare)
# Plotting results
prstd, iv_l, iv_u = wls_prediction_std(OLS)
fig = plt.figure(figsize=(18,6),facecolor='white',dpi=600)
ax = fig.add_subplot(121)
ax.plot(X['Age']/12, y, 'o',label="data")
ax.plot(X['Age']/12, iv_u, 'r--',alpha = 0.3)
ax.plot(X['Age']/12, iv_l, 'r--',alpha = 0.3)
ax.plot(X['Age']/12, hub_results.fittedvalues, 'k.-', label="MLE - H2")
# ax.plot(X['Age']/12, andrew_results.fittedvalues, 'c.-', label="Wave - H2")
ax.plot(X['Age']/12, andrew_results.fittedvalues, 'c.-', label="Trimmed - H2")
ax.plot(X['Age']/12, OLS.fittedvalues, 'r.-', label="OLS")
ax.legend(loc="best",fontsize='medium')
plt.ylabel(neuro_interest.split(':')[4],size=20)
plt.xlabel('Age',size=15)
plt.title(' '.join(neuro_interest.split(':')[2:4]),size=20)
ax = fig.add_subplot(122)
ax.plot(X['Age']/12, y, 'o',label="data")
ax.plot(X['Age']/12, iv_u, 'r--',alpha = 0.3)
ax.plot(X['Age']/12, iv_l, 'r--',alpha = 0.3)
ax.plot(X['Age']/12, hub_results.fittedvalues, 'k.-', label="MLE - H2")
# ax.plot(X['Age']/12, andrew_results.fittedvalues, 'c.-', label="Wave - H3")
ax.plot(X['Age']/12, andrew_results.fittedvalues, 'c.-', label="Trimmed - H2")
ax.plot(X['Age']/12, OLS.fittedvalues, 'r.-', label="OLS")
ax.legend(loc="best",fontsize='medium')
plt.xlabel('Age',size=15)
plt.ylim((-0.1,0.4))
plt.title('Zoomed view',size=20)
plt.show()
from IPython.display import Image
Image(filename='demo/demoFig.png')
from IPython.display import YouTubeVideo
vid = YouTubeVideo("https://youtu.be/QjyAV9QPiXU")
display(vid)
Image(filename='demo/bubble.jpg')